global functions (convert key to dummy)
# Load project helper functions used below (key.dummy, partition.3,
# opt.cut.func, opt.cut.plot, reg.opt.cut.func)
source("thesis_functions.R")
Load and prepare the data.
# Read the K-pop track dataset and coerce categorical fields to factors.
data <- fread("kpopdata.csv")
# NOTE(review): release_date is parsed with a two-digit year ("%m/%d/%y");
# confirm the CSV actually uses two-digit years.
data <- mutate(data, ArtistType = as.factor(ArtistType),
ArtistGender = as.factor(ArtistGender),
ArtistGen = factor(ArtistGen),
release_date = as.POSIXct(release_date, format = "%m/%d/%y"),
key = factor(key, levels = 0:11),
mode = as.factor(mode),
time_signature = factor(time_signature, levels = c(1,3,4,5)),
# Binary target: 1 when popularity >= 50, else 0 (used for classification)
popular = factor(ifelse(popularity >=50, 1, 0)))
understanding popularity
# Distribution of the raw popularity score (0-100): heavily right-skewed
hist(data$popularity)
summary(data$popularity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 11.00 23.00 25.88 39.00 94.00
heavily skewed right.
The square root transformation makes it more normal. This will help to meet the multiple linear regression assumptions.
# Square-root transform makes the popularity distribution closer to normal
hist(data$popularity^0.5)
Classification view: proportion of popular vs. non-popular songs.
# Class balance of the binary popular flag (proportions)
table(data$popular)/nrow(data)
##
## 0 1
## 0.8812021 0.1187979
If classifying a song as popular when its score is at least 50, only ~12% of the data is considered popular.
# NOTE(review): this selection (which includes time_signature) is immediately
# overwritten by a narrower audio-features-only selection below; this line is
# effectively dead code and could be removed.
kpop <- dplyr::select(data, popularity, duration, acousticness, danceability, energy, instrumentalness, key, loudness, mode, speechiness, time_signature, tempo, valence)
General Assumptions: continuous response: popularity score ranging from 0 - 100. mix of categorical and continuous response. the distribution of the variables are not normal, we will check for normality of error terms where appropriate.
Goal: create model for predicting popularity scores
select just audio features
# Keep only the popularity response plus the Spotify audio features
kpop <- select(data, popularity, duration, acousticness, danceability, energy, instrumentalness, key, loudness, mode, speechiness, tempo, valence)
# Split by musical mode (0 = minor, presumably; 1 = major -- TODO confirm)
# and drop the now-constant mode column from each subset
kpop0 <- kpop %>% filter(mode == 0)%>% select(-mode)
kpop1 <- kpop %>% filter(mode == 1) %>% select(-mode)
### Kpop mode 0 train and test
# 75/25 train/test split, seeded for reproducibility
smpl.size0 <- floor(0.75*nrow(kpop0))
set.seed(123)
smpl0 <- sample(nrow(kpop0), smpl.size0, replace = FALSE)
train0 <- kpop0[smpl0,]
test0 <- kpop0[-smpl0,]
### Kpop mode 1 train and test
# Same 75/25 split for the mode == 1 subset
smpl.size1 <- floor(0.75*nrow(kpop1))
set.seed(123)
smpl1 <- sample(nrow(kpop1), smpl.size1, replace = FALSE)
train1 <- kpop1[smpl1,]
test1 <- kpop1[-smpl1,]
fit a multiple linear regression model
# Baseline multiple linear regression on the untransformed popularity
# score, mode-0 training data; all audio features as predictors.
ml0 <- lm(popularity ~. , data = train0)
summary(ml0)
##
## Call:
## lm(formula = popularity ~ ., data = train0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.283 -13.161 -1.606 11.489 64.381
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 98.43022 4.99248 19.716 < 2e-16 ***
## duration -5.41104 0.55905 -9.679 < 2e-16 ***
## acousticness -3.81930 2.05303 -1.860 0.062924 .
## danceability -2.19279 3.28543 -0.667 0.504544
## energy -33.97108 3.33505 -10.186 < 2e-16 ***
## instrumentalness -15.67494 4.37625 -3.582 0.000346 ***
## key1 -1.20599 1.49947 -0.804 0.421291
## key2 -2.72991 1.85598 -1.471 0.141416
## key3 3.04084 2.07581 1.465 0.143040
## key4 -4.34609 1.46983 -2.957 0.003129 **
## key5 -0.56490 1.42695 -0.396 0.692217
## key6 1.34681 1.48984 0.904 0.366062
## key7 0.04123 1.63277 0.025 0.979856
## key8 -1.57381 1.76361 -0.892 0.372251
## key9 -0.94547 1.51224 -0.625 0.531872
## key10 -1.75720 1.55635 -1.129 0.258955
## key11 -2.41842 1.38950 -1.740 0.081861 .
## loudness 3.55623 0.20271 17.543 < 2e-16 ***
## speechiness 23.77941 4.74755 5.009 5.75e-07 ***
## tempo -0.00429 0.01255 -0.342 0.732398
## valence -11.88489 1.78863 -6.645 3.51e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.34 on 3483 degrees of freedom
## Multiple R-squared: 0.143, Adjusted R-squared: 0.138
## F-statistic: 29.05 on 20 and 3483 DF, p-value: < 2.2e-16
# Standard lm diagnostic plots (residuals vs fitted, QQ, scale-location, leverage)
plot(ml0)
Diagnostics: little or no multicollinearity and no autocorrelation; however, the residuals appear heteroscedastic, so the homoscedasticity assumption is violated.
Try again with a square-root transformation of popularity to make its distribution closer to normal:
# Refit the linear model on the square-root-transformed popularity
ml0.sqrt <- lm(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5))
summary(ml0.sqrt)
##
## Call:
## lm(formula = popularity ~ ., data = train0 %>% mutate(popularity = popularity^0.5))
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9651 -1.2163 0.1704 1.3715 5.2110
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.8026945 0.5397564 23.719 < 2e-16 ***
## duration -0.5857021 0.0604408 -9.691 < 2e-16 ***
## acousticness -0.1418036 0.2219612 -0.639 0.522952
## danceability -0.1381154 0.3552010 -0.389 0.697420
## energy -3.9085950 0.3605651 -10.840 < 2e-16 ***
## instrumentalness -2.2443174 0.4731338 -4.744 2.18e-06 ***
## key1 -0.1306867 0.1621132 -0.806 0.420214
## key2 -0.3754834 0.2006575 -1.871 0.061392 .
## key3 0.1823478 0.2244238 0.813 0.416551
## key4 -0.5438271 0.1589089 -3.422 0.000628 ***
## key5 -0.1190871 0.1542733 -0.772 0.440212
## key6 0.1383520 0.1610728 0.859 0.390433
## key7 -0.0235720 0.1765251 -0.134 0.893779
## key8 -0.1566102 0.1906708 -0.821 0.411495
## key9 -0.1520276 0.1634941 -0.930 0.352505
## key10 -0.2351475 0.1682632 -1.397 0.162353
## key11 -0.2909129 0.1502246 -1.937 0.052885 .
## loudness 0.4225379 0.0219162 19.280 < 2e-16 ***
## speechiness 2.1574574 0.5132766 4.203 2.70e-05 ***
## tempo -0.0007701 0.0013563 -0.568 0.570207
## valence -1.1893401 0.1933758 -6.150 8.60e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.874 on 3483 degrees of freedom
## Multiple R-squared: 0.1594, Adjusted R-squared: 0.1546
## F-statistic: 33.03 on 20 and 3483 DF, p-value: < 2.2e-16
assumption checking and diagnostics
plot(ml0.sqrt)
prediction on test data
# prediction on test data
# NOTE(review): mutating popularity in newdata is not needed for predict()
# (the response column is not used when predicting) -- harmless but redundant.
yhat.mlr = predict(ml0.sqrt, newdata = test0 %>% mutate(popularity = popularity^0.5))
# RMSE for test data (errors are on the sqrt scale)
error.mlr <- yhat.mlr - test0$popularity^0.5
rmse.mlr <- sqrt(mean(error.mlr^2))
data.frame(
RMSE = RMSE(yhat.mlr, test0$popularity^0.5),
R2 = R2(yhat.mlr, test0$popularity^0.5)
)
## RMSE R2
## 1 1.863478 0.1332768
Much improved!
# 10-fold cross-validated stepwise (AIC) linear regression, mode 0
set.seed(123)
train_control <- trainControl(method = "cv",
number = 10)
# capture.output() suppresses the verbose stepAIC trace
bye <- capture.output(mlr.step_kcv <- train(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5),
method = "lmStepAIC", trControl = train_control))
print(mlr.step_kcv)
## Linear Regression with Stepwise Selection
##
## 3504 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 3153, 3152, 3154, 3154, 3154, 3154, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 1.878512 0.1528544 1.516643
mlr.step_kcv$finalModel
##
## Call:
## lm(formula = .outcome ~ duration + energy + instrumentalness +
## key2 + key3 + key4 + key6 + key11 + loudness + speechiness +
## valence, data = dat)
##
## Coefficients:
## (Intercept) duration energy instrumentalness
## 12.4040 -0.5830 -3.8007 -2.2441
## key2 key3 key4 key6
## -0.2588 0.3053 -0.4241 0.2617
## key11 loudness speechiness valence
## -0.1689 0.4214 2.1000 -1.2328
prediction on test data
# prediction on test data
# key.dummy() converts the key factor into the dummy columns (key1..key11)
# that the stepwise finalModel expects
yhat.step_kcv = predict(mlr.step_kcv$finalModel, newdata=key.dummy(test0) %>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.step_kcv <- yhat.step_kcv - test0$popularity^0.5
rmse.step_kcv <- sqrt(mean(error.step_kcv^2))
data.frame(
RMSE = RMSE(yhat.step_kcv, test0$popularity^0.5),
R2 = R2(yhat.step_kcv, test0$popularity^0.5)
)
## RMSE R2
## 1 1.861179 0.1350903
Both of the models have RMSE scores of around 2. This isn't terrible since the range of the transformed popularity scores is 0-10.
# Ridge regression (alpha = 0) on sqrt popularity, mode 0; lambda tuned by CV
lm.ridge0 <- train(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5), method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#Best Tuning Parameter
lm.ridge0$bestTune
## alpha lambda
## 47 0 0.047
# best coefficient
lm.ridge0.model <- coef(lm.ridge0$finalModel, lm.ridge0$bestTune$lambda)
lm.ridge0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 12.3411323068
## duration -0.5762465119
## acousticness -0.0662308488
## danceability -0.1493986374
## energy -3.5601398635
## instrumentalness -2.2691735701
## key1 -0.0831090967
## key2 -0.3241117452
## key3 0.2260133231
## key4 -0.4931505942
## key5 -0.0698858769
## key6 0.1837187046
## key7 0.0173666066
## key8 -0.1059775161
## key9 -0.1084064701
## key10 -0.1844827951
## key11 -0.2447391761
## loudness 0.3990852347
## speechiness 2.0404769494
## tempo -0.0007694543
## valence -1.1748730700
# prediction on test data
# NOTE(review): caret's predict.train uses bestTune automatically; the
# s = lm.ridge0$bestTune argument is ignored and could be dropped.
yhat.ridge0 = predict(lm.ridge0, s=lm.ridge0$bestTune,newdata=test0%>% mutate(popularity = popularity^0.5))
# RMSE for test data
data.frame(
RMSE = RMSE(yhat.ridge0, test0$popularity^0.5),
R2 = R2(yhat.ridge0, test0$popularity^0.5)
)
## RMSE R2
## 1 1.861093 0.1339601
# Lasso regression (alpha = 1) on sqrt popularity, mode 0; lambda tuned by CV
lm.lasso0 <- train(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5), method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = 1,lambda = seq(0.001,0.1,by = 0.001)))
#Best Tuning Parameter
lm.lasso0$bestTune
## alpha lambda
## 17 1 0.017
# best coefficient
# "." entries below are coefficients the lasso shrank to exactly zero
lm.lasso0.model <- coef(lm.lasso0$finalModel, lm.lasso0$bestTune$lambda)
lm.lasso0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 11.92608192
## duration -0.55317885
## acousticness .
## danceability .
## energy -3.50918126
## instrumentalness -2.05824555
## key1 .
## key2 -0.16147902
## key3 0.21698994
## key4 -0.36337170
## key5 .
## key6 0.22354434
## key7 0.03593591
## key8 .
## key9 .
## key10 -0.04408683
## key11 -0.12086307
## loudness 0.39934685
## speechiness 1.77557012
## tempo .
## valence -1.14005299
# prediction on test data
yhat.lasso0 = predict(lm.lasso0, s=lm.lasso0$bestTune,newdata=test0%>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.lasso0 <- yhat.lasso0 - test0$popularity^0.5
rmse.lasso0 <- sqrt(mean(error.lasso0^2))
data.frame(
RMSE = RMSE(yhat.lasso0, test0$popularity^0.5),
R2 = R2(yhat.lasso0, test0$popularity^0.5)
)
## RMSE R2
## 1 1.858953 0.1351553
# Elastic net on sqrt popularity, mode 0; alpha and lambda tuned jointly by CV.
# FIX: the original passed family = "binomial", which requires a two-class
# response -- here the outcome is continuous (sqrt popularity), so glmnet
# cannot fit a binomial model (note this chunk produced no recorded output).
# Use the default gaussian family, consistent with the ridge/lasso fits above.
lm.enet0 <- train(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5), method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
lm.enet0$bestTune
# best coefficient
lm.enet0.model <- coef(lm.enet0$finalModel, lm.enet0$bestTune$lambda)
lm.enet0.model
# prediction on test data
yhat.enet0 = predict(lm.enet0, s=lm.enet0$bestTune,newdata=test0%>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.enet0 <- yhat.enet0 - test0$popularity^0.5
rmse.enet0 <- sqrt(mean(error.enet0^2))
data.frame(
RMSE = RMSE(yhat.enet0, test0$popularity^0.5),
R2 = R2(yhat.enet0, test0$popularity^0.5)
)
# Linear model on sqrt popularity for the mode == 1 subset
ml1.sqrt <- lm(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5))
summary(ml1.sqrt)
##
## Call:
## lm(formula = popularity ~ ., data = train1 %>% mutate(popularity = popularity^0.5))
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.4223 -1.2067 0.1313 1.3255 5.2394
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.857177 0.401243 27.059 < 2e-16 ***
## duration -0.554933 0.044683 -12.419 < 2e-16 ***
## acousticness 0.080560 0.148271 0.543 0.586927
## danceability 0.050885 0.254514 0.200 0.841542
## energy -3.077791 0.280908 -10.957 < 2e-16 ***
## instrumentalness -1.877299 0.352964 -5.319 1.09e-07 ***
## key1 0.321294 0.091623 3.507 0.000457 ***
## key2 0.207166 0.098073 2.112 0.034699 *
## key3 0.430613 0.150398 2.863 0.004210 **
## key4 -0.029116 0.127272 -0.229 0.819058
## key5 0.204278 0.111709 1.829 0.067504 .
## key6 0.216946 0.109298 1.985 0.047206 *
## key7 0.283270 0.091145 3.108 0.001894 **
## key8 0.297098 0.105736 2.810 0.004975 **
## key9 0.270884 0.113464 2.387 0.017001 *
## key10 0.080066 0.132171 0.606 0.544685
## key11 0.389907 0.117678 3.313 0.000928 ***
## loudness 0.386256 0.017387 22.216 < 2e-16 ***
## speechiness 4.579814 0.414965 11.037 < 2e-16 ***
## tempo -0.001474 0.000958 -1.538 0.124055
## valence -0.849286 0.154553 -5.495 4.08e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.821 on 5483 degrees of freedom
## Multiple R-squared: 0.1499, Adjusted R-squared: 0.1468
## F-statistic: 48.35 on 20 and 5483 DF, p-value: < 2.2e-16
prediction on test data
# prediction on test data (errors on the sqrt scale)
yhat.mlr = predict(ml1.sqrt, newdata = test1 %>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.mlr <- yhat.mlr - test1$popularity^0.5
rmse.mlr <- sqrt(mean(error.mlr^2))
data.frame(
RMSE = RMSE(yhat.mlr, test1$popularity^0.5),
R2 = R2(yhat.mlr, test1$popularity^0.5)
)
## RMSE R2
## 1 1.858473 0.1257546
# 10-fold cross-validated stepwise (AIC) linear regression, mode 1
set.seed(123)
train_control <- trainControl(method = "cv",
number = 10)
# capture.output() suppresses the verbose stepAIC trace
bye <- capture.output(mlr.step_kcv <- train(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5),
method = "lmStepAIC", trControl = train_control))
print(mlr.step_kcv)
## Linear Regression with Stepwise Selection
##
## 5504 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4953, 4954, 4954, 4955, 4953, 4953, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 1.823965 0.1452973 1.468792
mlr.step_kcv$finalModel
##
## Call:
## lm(formula = .outcome ~ duration + energy + instrumentalness +
## key1 + key2 + key3 + key5 + key6 + key7 + key8 + key9 + key11 +
## loudness + speechiness + tempo + valence, data = dat)
##
## Coefficients:
## (Intercept) duration energy instrumentalness
## 10.984803 -0.555451 -3.160519 -1.884462
## key1 key2 key3 key5
## 0.311835 0.198954 0.425083 0.196319
## key6 key7 key8 key9
## 0.209863 0.274552 0.288519 0.261604
## key11 loudness speechiness tempo
## 0.383154 0.387173 4.578753 -0.001535
## valence
## -0.836556
prediction on test data
# prediction on test data; key.dummy() supplies the key1..key11 dummies
yhat.step_kcv = predict(mlr.step_kcv$finalModel, newdata=key.dummy(test1) %>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.step_kcv <- yhat.step_kcv - test1$popularity^0.5
rmse.step_kcv <- sqrt(mean(error.step_kcv^2))
data.frame(
RMSE = RMSE(yhat.step_kcv, test1$popularity^0.5),
R2 = R2(yhat.step_kcv, test1$popularity^0.5)
)
## RMSE R2
## 1 1.858518 0.1257011
# Ridge regression (alpha = 0) on sqrt popularity, mode 1
lm.ridge1 <- train(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5), method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#Best Tuning Parameter
lm.ridge1$bestTune
## alpha lambda
## 45 0 0.045
# best coefficient
lm.ridge1.model <- coef(lm.ridge1$finalModel, lm.ridge1$bestTune$lambda)
lm.ridge1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 10.347839763
## duration -0.543796248
## acousticness 0.169898922
## danceability 0.049633951
## energy -2.620990536
## instrumentalness -1.927744817
## key1 0.293618095
## key2 0.182685249
## key3 0.399431145
## key4 -0.049409156
## key5 0.179350934
## key6 0.188771865
## key7 0.259698734
## key8 0.273385099
## key9 0.249069582
## key10 0.051821098
## key11 0.361138772
## loudness 0.355806469
## speechiness 4.325244621
## tempo -0.001381351
## valence -0.858954968
# prediction on test data (s= is ignored by caret's predict.train)
yhat.ridge1 = predict(lm.ridge1, s=lm.ridge1$bestTune,newdata=test1%>% mutate(popularity = popularity^0.5))
# RMSE for test data
data.frame(
RMSE = RMSE(yhat.ridge1, test1$popularity^0.5),
R2 = R2(yhat.ridge1, test1$popularity^0.5)
)
## RMSE R2
## 1 1.858182 0.1253763
# Lasso regression (alpha = 1) on sqrt popularity, mode 1
lm.lasso1 <- train(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5), method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = 1,lambda = seq(0.001,0.1,by = 0.001)))
#Best Tuning Parameter
lm.lasso1$bestTune
## alpha lambda
## 3 1 0.003
# best coefficient (small lambda here, so no coefficients were zeroed out)
lm.lasso1.model <- coef(lm.lasso1$finalModel, lm.lasso1$bestTune$lambda)
lm.lasso1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 10.823106069
## duration -0.549904268
## acousticness 0.078797866
## danceability 0.007432375
## energy -3.012459438
## instrumentalness -1.855427319
## key1 0.268514415
## key2 0.153367880
## key3 0.368124062
## key4 -0.060812240
## key5 0.148197878
## key6 0.160460653
## key7 0.231766196
## key8 0.242516531
## key9 0.215811156
## key10 0.020377237
## key11 0.334220735
## loudness 0.381054835
## speechiness 4.504126435
## tempo -0.001394241
## valence -0.829567095
# prediction on test data
yhat.lasso1 = predict(lm.lasso1, s=lm.lasso1$bestTune,newdata=test1%>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.lasso1 <- yhat.lasso1 - test1$popularity^0.5
rmse.lasso1 <- sqrt(mean(error.lasso1^2))
data.frame(
RMSE = RMSE(yhat.lasso1, test1$popularity^0.5),
R2 = R2(yhat.lasso1, test1$popularity^0.5)
)
## RMSE R2
## 1 1.858393 0.1255584
# Elastic net on sqrt popularity, mode 1; alpha and lambda tuned jointly by CV.
# FIX: as with lm.enet0, the original passed family = "binomial" with a
# continuous response, which glmnet cannot fit (no output was recorded).
# Drop it so the default gaussian family is used, matching ridge/lasso above.
lm.enet1 <- train(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5), method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
lm.enet1$bestTune
# best coefficient
lm.enet1.model <- coef(lm.enet1$finalModel, lm.enet1$bestTune$lambda)
lm.enet1.model
# prediction on test data
yhat.enet1 = predict(lm.enet1, s=lm.enet1$bestTune,newdata=test1%>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.enet1 <- yhat.enet1 - test1$popularity^0.5
rmse.enet1 <- sqrt(mean(error.enet1^2))
data.frame(
RMSE = RMSE(yhat.enet1, test1$popularity^0.5),
R2 = R2(yhat.enet1, test1$popularity^0.5)
)
Use 70% train, 15% validation, 15% test, to use validation for finding optimal cutoff value.
# Classification dataset: binary popular flag plus audio features
kpop.logit <- select(data, popular, duration, acousticness, danceability, energy, instrumentalness, key, loudness, mode, speechiness, tempo, valence)
logit.kpop0 <- kpop.logit %>% filter(mode == 0)%>% select(-mode)
logit.kpop1 <- kpop.logit %>% filter(mode == 1) %>% select(-mode)
### Kpop mode 0 train and test
# (earlier 75/25 split retained for reference)
# smpl.size0 <- floor(0.75*nrow(logit.kpop0))
# set.seed(123)
# smpl0 <- sample(nrow(logit.kpop0), smpl.size0, replace = FALSE)
# og.logit.train0 <- logit.kpop0[smpl0,]
# og.logit.test0 <- logit.kpop0[-smpl0,]
# 70/15/15 train/validation/test split via project helper partition.3;
# validation is used to pick the probability cutoff
set.seed(123)
p3 <- partition.3(logit.kpop0, 0.70, 0.15)
logit.train0 <- p3$data.train
logit.valid0 <- p3$data.val
logit.test0 <- p3$data.test
# train + validation combined, for refitting the final model
all.train0 <- rbind(logit.train0, logit.valid0)
# ### Kpop mode 1 train and test
# smpl.size1 <- floor(0.75*nrow(logit.kpop1))
# set.seed(123)
# smpl1 <- sample(nrow(logit.kpop1), smpl.size1, replace = FALSE)
# logit.train1 <- logit.kpop1[smpl1,]
# logit.test1 <- logit.kpop1[-smpl1,]
set.seed(123)
p3 <- partition.3(logit.kpop1, 0.70, 0.15)
logit.train1 <- p3$data.train
logit.valid1 <- p3$data.val
logit.test1 <- p3$data.test
all.train1 <- rbind(logit.train1, logit.valid1)
fitting logistic model using combo of train/valid/test, finding optimal model using training data.
# Fit logistic model on training data
v.logit.model0 <- glm(popular ~ ., family=binomial(link='logit'),data= logit.train0)
#search for best cutoff on the validation set (project helper)
out0 <- opt.cut.func(v.logit.model0, logit.valid0)
opt.cut.plot(out0)
# cutoff minimizing |sensitivity - specificity|
out0$cutoff[which.min(out0$ssdiff.vec)]
## [1] 0.14
# cutoff maximizing kappa
v.opt.cut0 <- out0$cutoff[which.max(out0$kappa.vec)]
v.opt.cut0
## [1] 0.18
Fit final model (combo of train and validation)
# NOTE(review): v.model.final is fit here but the predictions below use
# v.logit.model0 (training-only model) -- confirm which was intended.
v.model.final <- glm(popular ~ ., data=all.train0, family=binomial(link='logit'))
predict on test using 0.5 cutoff
# Predict class probabilities on the test set (training-only model;
# see note above -- the refit v.model.final is not used here)
v.prob.test <- predict(v.logit.model0, newdata = logit.test0, type = "response")
v.pred.test <- ifelse(v.prob.test > 0.5, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(v.pred.test), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 606 93
## 1 1 1
##
## Accuracy : 0.8659
## 95% CI : (0.8384, 0.8903)
## No Information Rate : 0.8659
## P-Value [Acc > NIR] : 0.5275
##
## Kappa : 0.0153
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.010638
## Specificity : 0.998353
## Pos Pred Value : 0.500000
## Neg Pred Value : 0.866953
## Prevalence : 0.134094
## Detection Rate : 0.001427
## Detection Prevalence : 0.002853
## Balanced Accuracy : 0.504495
##
## 'Positive' Class : 1
##
##
predict on optimal cutoff (0.18)
# Same probabilities, thresholded at the kappa-optimal cutoff
# (recomputing v.prob.test is redundant but harmless)
v.prob.test <- predict(v.logit.model0, newdata = logit.test0, type = "response")
v.pred.test <- ifelse(v.prob.test > v.opt.cut0, 1, 0) # using cutoff = 0.18
confusionMatrix(as.factor(v.pred.test), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 482 67
## 1 125 27
##
## Accuracy : 0.7261
## 95% CI : (0.6915, 0.7588)
## No Information Rate : 0.8659
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0645
##
## Mcnemar's Test P-Value : 3.895e-05
##
## Sensitivity : 0.28723
## Specificity : 0.79407
## Pos Pred Value : 0.17763
## Neg Pred Value : 0.87796
## Prevalence : 0.13409
## Detection Rate : 0.03852
## Detection Prevalence : 0.21683
## Balanced Accuracy : 0.54065
##
## 'Positive' Class : 1
##
##
Predict on optimal balance between sensitivity and specificity (0.14)
# Threshold at the cutoff balancing sensitivity and specificity (0.14)
v.prob.test <- predict(v.logit.model0, newdata = logit.test0, type = "response")
v.pred.test <- ifelse(v.prob.test > 0.14, 1, 0) # using cutoff = 0.14
confusionMatrix(as.factor(v.pred.test), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 383 42
## 1 224 52
##
## Accuracy : 0.6205
## 95% CI : (0.5835, 0.6566)
## No Information Rate : 0.8659
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1013
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.55319
## Specificity : 0.63097
## Pos Pred Value : 0.18841
## Neg Pred Value : 0.90118
## Prevalence : 0.13409
## Detection Rate : 0.07418
## Detection Prevalence : 0.39372
## Balanced Accuracy : 0.59208
##
## 'Positive' Class : 1
##
##
step-wise 10 fold cross validation
# 10-fold CV stepwise (AIC) logistic regression on the training split
set.seed(123)
train_control <- trainControl(method = "cv",
number = 10)
# Fit K-fold CV model (capture.output silences the stepAIC trace)
meh <- capture.output(step0_kcv <- train(popular ~ ., data = logit.train0, family = binomial(),
method = "glmStepAIC", trControl = train_control, verbose = FALSE))
step0_kcv <- step0_kcv$finalModel
step0_kcv
##
## Call: NULL
##
## Coefficients:
## (Intercept) duration acousticness energy
## 5.2007 -0.5129 -1.4480 -3.7363
## instrumentalness key3 key4 loudness
## -2.3933 0.7328 -0.3737 0.3399
## speechiness valence
## 3.1716 -1.4940
##
## Degrees of Freedom: 3270 Total (i.e. Null); 3261 Residual
## Null Deviance: 2609
## Residual Deviance: 2453 AIC: 2473
# Test predictions; key.dummy() supplies the dummy-coded key columns
kcv.prob.test0 <- predict(step0_kcv, newdata = key.dummy(logit.test0), type = "response")
kcv.pred.test0 <- ifelse(kcv.prob.test0 > 0.5, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(kcv.pred.test0), as.factor(logit.test0$popular), positive = "1")
Find optimal cut off value:
#search for best cutoff on the validation set
kcv.out0 <- opt.cut.func(step0_kcv, key.dummy(logit.valid0))
opt.cut.plot(kcv.out0)
# cutoff minimizing |sensitivity - specificity|
kcv.out0$cutoff[which.min(kcv.out0$ssdiff.vec)]
## [1] 0.14
# cutoff maximizing kappa
kcv.out0.cut0 <- kcv.out0$cutoff[which.max(kcv.out0$kappa.vec)]
kcv.out0.cut0
## [1] 0.17
Fit final model (combo of train and validation)
# Refit the stepwise logistic model on train + validation combined
set.seed(123)
finalmeh <- capture.output(step0_kcv.final0 <- train(popular ~ ., data = all.train0, family = binomial(),
method = "glmStepAIC", trControl = train_control, verbose = FALSE))
step0_kcv.final0 <- step0_kcv.final0$finalModel
step0_kcv.final0
##
## Call: NULL
##
## Coefficients:
## (Intercept) duration acousticness energy
## 5.3767 -0.5256 -1.5256 -3.8603
## instrumentalness key3 key4 loudness
## -2.7390 0.6388 -0.4434 0.3518
## speechiness valence
## 3.3281 -1.5226
##
## Degrees of Freedom: 3971 Total (i.e. Null); 3962 Residual
## Null Deviance: 3102
## Residual Deviance: 2908 AIC: 2928
predict on test using 0.5 cutoff
# Final stepwise model: test predictions at the default 0.5 cutoff
kcv.prob.test0 <- predict(step0_kcv.final0, newdata = key.dummy(logit.test0), type = "response")
kcv.pred.test0 <- ifelse(kcv.prob.test0 > 0.5, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(kcv.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 606 93
## 1 1 1
##
## Accuracy : 0.8659
## 95% CI : (0.8384, 0.8903)
## No Information Rate : 0.8659
## P-Value [Acc > NIR] : 0.5275
##
## Kappa : 0.0153
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.010638
## Specificity : 0.998353
## Pos Pred Value : 0.500000
## Neg Pred Value : 0.866953
## Prevalence : 0.134094
## Detection Rate : 0.001427
## Detection Prevalence : 0.002853
## Balanced Accuracy : 0.504495
##
## 'Positive' Class : 1
##
##
predict on test using optimal kappa, 0.17 cutoff
# Final stepwise model: test predictions at the kappa-optimal cutoff
kcv.prob.test0 <- predict(step0_kcv.final0, newdata = key.dummy(logit.test0), type = "response")
kcv.pred.test0 <- ifelse(kcv.prob.test0 > 0.17, 1, 0) # using cutoff = 0.17
confusionMatrix(as.factor(kcv.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 475 65
## 1 132 29
##
## Accuracy : 0.719
## 95% CI : (0.6841, 0.752)
## No Information Rate : 0.8659
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.07
##
## Mcnemar's Test P-Value : 2.572e-06
##
## Sensitivity : 0.30851
## Specificity : 0.78254
## Pos Pred Value : 0.18012
## Neg Pred Value : 0.87963
## Prevalence : 0.13409
## Detection Rate : 0.04137
## Detection Prevalence : 0.22967
## Balanced Accuracy : 0.54552
##
## 'Positive' Class : 1
##
##
predict on test using 0.14 cutoff, optimal balance between sensitivity and specificity
# Final stepwise model: cutoff balancing sensitivity/specificity
kcv.prob.test0 <- predict(step0_kcv.final0, newdata = key.dummy(logit.test0), type = "response")
kcv.pred.test0 <- ifelse(kcv.prob.test0 > 0.14, 1, 0) # using cutoff = 0.14
confusionMatrix(as.factor(kcv.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 397 45
## 1 210 49
##
## Accuracy : 0.6362
## 95% CI : (0.5994, 0.6719)
## No Information Rate : 0.8659
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1007
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5213
## Specificity : 0.6540
## Pos Pred Value : 0.1892
## Neg Pred Value : 0.8982
## Prevalence : 0.1341
## Detection Rate : 0.0699
## Detection Prevalence : 0.3695
## Balanced Accuracy : 0.5877
##
## 'Positive' Class : 1
##
##
# Exhaustive best-subset logistic regression via glmulti (main effects only)
set.seed(123)
glmulti.out0 <- glmulti(popular ~ ., data = logit.train0,
level = 1, # No interaction considered
method = "h", # Exhaustive approach
crit = "aic", # AIC as criteria
confsetsize = 5, # Keep 5 best models
plotty = F, report = F, # No plot or interim reports
fitfunction = "glm", # glm fit function
family = binomial) # binomial to run logistic regression
#glmulti.out0@formulas
view summary of top model
summary(glmulti.out0@objects[[1]])
##
## Call:
## fitfunc(formula = as.formula(x), family = ..1, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3900 -0.5880 -0.4766 -0.3530 2.4475
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.2221 0.7349 7.106 1.19e-12 ***
## duration -0.5083 0.1054 -4.822 1.42e-06 ***
## acousticness -1.4563 0.3749 -3.885 0.000102 ***
## energy -3.8200 0.5976 -6.393 1.63e-10 ***
## instrumentalness -2.3856 1.8267 -1.306 0.191582
## loudness 0.3438 0.0457 7.522 5.39e-14 ***
## speechiness 3.0901 0.7440 4.153 3.28e-05 ***
## valence -1.4181 0.2775 -5.111 3.20e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2609.2 on 3270 degrees of freedom
## Residual deviance: 2465.1 on 3263 degrees of freedom
## AIC: 2481.1
##
## Number of Fisher Scoring iterations: 6
Store model
# Best model (first of the kept confidence set)
allreg.logit0 <- glmulti.out0@objects[[1]]
#search for best cutoff on the validation set
allreg.out0 <- opt.cut.func(allreg.logit0, logit.valid0)
opt.cut.plot(allreg.out0)
# cutoff minimizing |sensitivity - specificity|
allreg.out0$cutoff[which.min(allreg.out0$ssdiff.vec)]
## [1] 0.15
# cutoff maximizing kappa
allreg.out0.cut0 <- allreg.out0$cutoff[which.max(allreg.out0$kappa.vec)]
allreg.out0.cut0
## [1] 0.14
fit final model to combo of training and validation
# Refit glmulti best-subset search on train + validation combined
set.seed(123)
glmulti.out0 <- glmulti(popular ~ ., data = all.train0,
level = 1, # No interaction considered
method = "h", # Exhaustive approach
crit = "aic", # AIC as criteria
confsetsize = 5, # Keep 5 best models
plotty = F, report = F, # No plot or interim reports
fitfunction = "glm", # glm fit function
family = binomial) # binomial to run logistic regression
#glmulti.out1@formulas
summary(glmulti.out0@objects[[1]])
##
## Call:
## fitfunc(formula = as.formula(x), family = ..1, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4130 -0.5781 -0.4653 -0.3432 2.4901
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.36929 0.67847 7.914 2.50e-15 ***
## duration -0.51557 0.09501 -5.426 5.75e-08 ***
## acousticness -1.53724 0.34555 -4.449 8.64e-06 ***
## energy -3.95482 0.55281 -7.154 8.43e-13 ***
## instrumentalness -2.72227 1.84991 -1.472 0.141
## loudness 0.35437 0.04254 8.330 < 2e-16 ***
## speechiness 3.27806 0.67787 4.836 1.33e-06 ***
## valence -1.44487 0.25585 -5.647 1.63e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3102.1 on 3971 degrees of freedom
## Residual deviance: 2922.2 on 3964 degrees of freedom
## AIC: 2938.2
##
## Number of Fisher Scoring iterations: 6
store final model
allreg.logit0.final <- glmulti.out0@objects[[1]]
Predictions with 0.5 as the cut off
# Best-subset final model: test predictions at the default 0.5 cutoff
allreg.prob.test0 <- predict(allreg.logit0.final, newdata = logit.test0, type = "response")
allreg.pred.test0 <- ifelse(allreg.prob.test0 > 0.5, 1, 0) # using standard cutoff 0.5
confusionMatrix(as.factor(allreg.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 606 93
## 1 1 1
##
## Accuracy : 0.8659
## 95% CI : (0.8384, 0.8903)
## No Information Rate : 0.8659
## P-Value [Acc > NIR] : 0.5275
##
## Kappa : 0.0153
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.010638
## Specificity : 0.998353
## Pos Pred Value : 0.500000
## Neg Pred Value : 0.866953
## Prevalence : 0.134094
## Detection Rate : 0.001427
## Detection Prevalence : 0.002853
## Balanced Accuracy : 0.504495
##
## 'Positive' Class : 1
##
##
Predictions where cut off yields the best kappa, 0.14
# Best-subset final model: kappa-optimal cutoff from the validation search
allreg.prob.test0 <- predict(allreg.logit0.final, newdata = logit.test0, type = "response")
allreg.pred.test0 <- ifelse(allreg.prob.test0 > allreg.out0.cut0, 1, 0) # using cutoff 0.14
confusionMatrix(as.factor(allreg.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 404 42
## 1 203 52
##
## Accuracy : 0.6505
## 95% CI : (0.6139, 0.6858)
## No Information Rate : 0.8659
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1269
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.55319
## Specificity : 0.66557
## Pos Pred Value : 0.20392
## Neg Pred Value : 0.90583
## Prevalence : 0.13409
## Detection Rate : 0.07418
## Detection Prevalence : 0.36377
## Balanced Accuracy : 0.60938
##
## 'Positive' Class : 1
##
##
Predictions where cut off is the best balance of sensitivity and specificity, 0.15
# Best-subset final model: cutoff balancing sensitivity/specificity
allreg.prob.test0 <- predict(allreg.logit0.final, newdata = logit.test0, type = "response")
allreg.pred.test0 <- ifelse(allreg.prob.test0 > 0.15, 1, 0) # using cutoff 0.15
confusionMatrix(as.factor(allreg.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 435 51
## 1 172 43
##
## Accuracy : 0.6819
## 95% CI : (0.646, 0.7162)
## No Information Rate : 0.8659
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1128
##
## Mcnemar's Test P-Value : 9.297e-16
##
## Sensitivity : 0.45745
## Specificity : 0.71664
## Pos Pred Value : 0.20000
## Neg Pred Value : 0.89506
## Prevalence : 0.13409
## Detection Rate : 0.06134
## Detection Prevalence : 0.30670
## Balanced Accuracy : 0.58704
##
## 'Positive' Class : 1
##
##
# Ridge-penalized (alpha = 0) logistic regression on the mode-0 training
# set; lambda tuned by cross-validation (train_control) over a fixed grid.
set.seed(123)
ridge0 <- train(popular ~ ., data = logit.train0, method = "glmnet",
family = "binomial", trControl = train_control,
tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#best tuning parameter
ridge0$bestTune
## alpha lambda
## 100 0 0.1
# NOTE(review): the selected lambda (0.1) sits at the upper edge of the
# tuning grid -- consider widening the grid and re-tuning.
ridge0.model <- coef(ridge0$finalModel, ridge0$bestTune$lambda)
ridge0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.3812536494
## duration -0.2559119522
## acousticness -0.3997989337
## danceability -0.1441080290
## energy -0.6313909327
## instrumentalness -0.8210624307
## key1 -0.0140152818
## key2 -0.1557323229
## key3 0.4654206935
## key4 -0.1797053979
## key5 0.0291019833
## key6 0.0724378741
## key7 0.0623824309
## key8 -0.0095258891
## key9 0.0216445665
## key10 0.0301392242
## key11 -0.0311864381
## loudness 0.0852587410
## speechiness 1.4727602887
## tempo 0.0001278062
## valence -0.7066431211
Search for best cutoff using validation set
# Search the validation set for the best probability cutoff for ridge0:
# the kappa-optimal cutoff and the cutoff minimizing |sens - spec|.
# Note: the values are printed but not stored; later blocks hard-code them.
ridge0.out <- reg.opt.cut.func(ridge0, logit.valid0)
opt.cut.plot(ridge0.out)
# cut off by kappa
ridge0.out$cutoff[which.max(ridge0.out$kappa.vec)]
## [1] 0.15
# cut off by sensitivity/specificity balance
ridge0.out$cutoff[which.min(ridge0.out$ssdiff.vec)]
## [1] 0.14
create final model
# Refit the ridge model on the combined training + validation data
# (all.train0) before the final test-set evaluation.
set.seed(123)
ridge0 <- train(popular ~ ., data = all.train0, method = "glmnet",
family = "binomial", trControl = train_control,
tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#best tuning parameter
ridge0$bestTune
## alpha lambda
## 100 0 0.1
# Coefficients at the selected lambda (again at the grid's upper bound).
ridge0.model <- coef(ridge0$finalModel, ridge0$bestTune$lambda)
ridge0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.321762194
## duration -0.257359810
## acousticness -0.409439651
## danceability -0.062528153
## energy -0.607499947
## instrumentalness -0.839357369
## key1 0.013386505
## key2 -0.091231783
## key3 0.400527208
## key4 -0.198298922
## key5 0.053748073
## key6 0.142443327
## key7 0.036589828
## key8 0.030302380
## key9 -0.041870877
## key10 0.034094167
## key11 -0.034462056
## loudness 0.084161023
## speechiness 1.615890923
## tempo -0.000348579
## valence -0.724448093
predict and evaluate on the test set where cutoff is at 0.5
# Test-set evaluation at the default 0.5 cutoff.
# NOTE(review): `s` is not a predict.train argument and is presumably
# ignored here -- confirm; the tuned lambda is already baked into ridge0.
prob.ridge0 <- predict(ridge0, s = ridge0$bestTune, logit.test0, type = "prob")
pred.ridge0 <- ifelse(prob.ridge0[,2] > 0.50, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(pred.ridge0), as.factor(logit.test0$popular),
positive = "1")
## Warning in confusionMatrix.default(as.factor(pred.ridge0),
## as.factor(logit.test0$popular), : Levels are not in the same order for reference
## and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 607 94
## 1 0 0
##
## Accuracy : 0.8659
## 95% CI : (0.8384, 0.8903)
## No Information Rate : 0.8659
## P-Value [Acc > NIR] : 0.5275
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.8659
## Prevalence : 0.1341
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 1
##
predict and evaluate on the test set where cutoff is at 0.15 corresponding to optimal kappa
# Test-set evaluation at the 0.15 cutoff (kappa-optimal on validation).
# Bug fix: the original assigned to `prob.ridg0` (typo) while the next line
# read the stale `prob.ridge0`, and it passed `ridge0bestTune` (missing `$`),
# which would error with "object not found". The spurious `s =` argument is
# dropped -- predict.train uses the tuned lambda stored in the model.
prob.ridge0 <- predict(ridge0, logit.test0, type = "prob")
pred.ridge0 <- ifelse(prob.ridge0[,2] > 0.15, 1, 0) # using cutoff = 0.15
confusionMatrix(as.factor(pred.ridge0), as.factor(logit.test0$popular),
                positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 464 63
## 1 143 31
##
## Accuracy : 0.7061
## 95% CI : (0.6709, 0.7396)
## No Information Rate : 0.8659
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0693
##
## Mcnemar's Test P-Value : 3.709e-08
##
## Sensitivity : 0.32979
## Specificity : 0.76442
## Pos Pred Value : 0.17816
## Neg Pred Value : 0.88046
## Prevalence : 0.13409
## Detection Rate : 0.04422
## Detection Prevalence : 0.24822
## Balanced Accuracy : 0.54710
##
## 'Positive' Class : 1
##
predict and evaluate on the test set where cutoff is at 0.14 corresponding to optimal sensitivity and specificity balance
# Test-set evaluation at the 0.14 cutoff (best sensitivity/specificity
# balance on validation).
# Bug fix: same two typos as the previous chunk -- `prob.ridg0` (stale
# `prob.ridge0` then used) and `ridge0bestTune` (missing `$`, would error).
# The ignored `s =` argument is dropped.
prob.ridge0 <- predict(ridge0, logit.test0, type = "prob")
pred.ridge0 <- ifelse(prob.ridge0[,2] > 0.14, 1, 0) # using cutoff = 0.14
confusionMatrix(as.factor(pred.ridge0), as.factor(logit.test0$popular),
                positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 403 48
## 1 204 46
##
## Accuracy : 0.6405
## 95% CI : (0.6037, 0.6761)
## No Information Rate : 0.8659
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0901
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.48936
## Specificity : 0.66392
## Pos Pred Value : 0.18400
## Neg Pred Value : 0.89357
## Prevalence : 0.13409
## Detection Rate : 0.06562
## Detection Prevalence : 0.35663
## Balanced Accuracy : 0.57664
##
## 'Positive' Class : 1
##
# Elastic-net logistic regression on the mode-0 training set, tuning both
# alpha and lambda by cross-validation.
set.seed(123)
enet0 <- train(popular ~ ., data = logit.train0, method = "glmnet",
family = "binomial", trControl = train_control,
tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
enet0$bestTune
## alpha lambda
## 100 0 0.1
# best coefficient
# Note: alpha = 0 was selected, so the "elastic net" collapses to pure
# ridge -- identical coefficients to ridge0 above.
enet0.model <- coef(enet0$finalModel, enet0$bestTune$lambda)
enet0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.3812536494
## duration -0.2559119522
## acousticness -0.3997989337
## danceability -0.1441080290
## energy -0.6313909327
## instrumentalness -0.8210624307
## key1 -0.0140152818
## key2 -0.1557323229
## key3 0.4654206935
## key4 -0.1797053979
## key5 0.0291019833
## key6 0.0724378741
## key7 0.0623824309
## key8 -0.0095258891
## key9 0.0216445665
## key10 0.0301392242
## key11 -0.0311864381
## loudness 0.0852587410
## speechiness 1.4727602887
## tempo 0.0001278062
## valence -0.7066431211
search for best cutoff with validation set
# Validation-set cutoff search for enet0: kappa-optimal cutoff and the
# cutoff minimizing |sensitivity - specificity|. Values are printed, not
# stored; later chunks hard-code 0.15 and 0.14.
enet0.out <- reg.opt.cut.func(enet0, logit.valid0)
opt.cut.plot(enet0.out)
# cut off by kappa
enet0.out$cutoff[which.max(enet0.out$kappa.vec)]
## [1] 0.15
# cut off by sensitivity/specificity balance
enet0.out$cutoff[which.min(enet0.out$ssdiff.vec)]
## [1] 0.14
create final model on train and validation
# Refit the elastic net on combined training + validation (all.train0)
# before the final test-set evaluation.
set.seed(123)
enet0 <- train(popular ~ ., data = all.train0, method = "glmnet",
family = "binomial", trControl = train_control,
tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
enet0$bestTune
## alpha lambda
## 100 0 0.1
# best coefficient
# alpha = 0 again: the final model is effectively ridge.
enet0.model <- coef(enet0$finalModel, enet0$bestTune$lambda)
enet0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.321762194
## duration -0.257359810
## acousticness -0.409439651
## danceability -0.062528153
## energy -0.607499947
## instrumentalness -0.839357369
## key1 0.013386505
## key2 -0.091231783
## key3 0.400527208
## key4 -0.198298922
## key5 0.053748073
## key6 0.142443327
## key7 0.036589828
## key8 0.030302380
## key9 -0.041870877
## key10 0.034094167
## key11 -0.034462056
## loudness 0.084161023
## speechiness 1.615890923
## tempo -0.000348579
## valence -0.724448093
predict and evaluate on the test set where cutoff is at 0.5
# Test-set evaluation at the default 0.5 cutoff.
# NOTE(review): `s` is not a predict.train argument and is presumably
# ignored -- confirm; the tuned parameters live inside enet0.
prob.enet0 <- predict(enet0, s = enet0$bestTune, logit.test0, type = "prob")
pred.enet0 <- ifelse(prob.enet0[,2] > 0.50, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(pred.enet0), as.factor(logit.test0$popular),
positive = "1")
## Warning in confusionMatrix.default(as.factor(pred.enet0),
## as.factor(logit.test0$popular), : Levels are not in the same order for reference
## and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 607 94
## 1 0 0
##
## Accuracy : 0.8659
## 95% CI : (0.8384, 0.8903)
## No Information Rate : 0.8659
## P-Value [Acc > NIR] : 0.5275
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.8659
## Prevalence : 0.1341
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 1
##
predict and evaluate on the test set where cutoff is at 0.15 corresponding to optimal kappa
# Test-set evaluation at the 0.15 cutoff (kappa-optimal on validation).
prob.enet0 <- predict(enet0, s = enet0$bestTune, logit.test0, type = "prob")
pred.enet0 <- ifelse(prob.enet0[,2] > 0.15, 1, 0) # using cutoff = 0.15
confusionMatrix(as.factor(pred.enet0), as.factor(logit.test0$popular),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 464 63
## 1 143 31
##
## Accuracy : 0.7061
## 95% CI : (0.6709, 0.7396)
## No Information Rate : 0.8659
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0693
##
## Mcnemar's Test P-Value : 3.709e-08
##
## Sensitivity : 0.32979
## Specificity : 0.76442
## Pos Pred Value : 0.17816
## Neg Pred Value : 0.88046
## Prevalence : 0.13409
## Detection Rate : 0.04422
## Detection Prevalence : 0.24822
## Balanced Accuracy : 0.54710
##
## 'Positive' Class : 1
##
predict and evaluate on the test set where cutoff is at 0.14 corresponding to optimal balance of sensitivity and specificity
# Test-set evaluation at the 0.14 cutoff (best sens/spec balance on
# validation).
prob.enet0 <- predict(enet0, s = enet0$bestTune, logit.test0, type = "prob")
pred.enet0 <- ifelse(prob.enet0[,2] > 0.14, 1, 0) # using cutoff = 0.14
confusionMatrix(as.factor(pred.enet0), as.factor(logit.test0$popular),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 403 48
## 1 204 46
##
## Accuracy : 0.6405
## 95% CI : (0.6037, 0.6761)
## No Information Rate : 0.8659
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0901
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.48936
## Specificity : 0.66392
## Pos Pred Value : 0.18400
## Neg Pred Value : 0.89357
## Prevalence : 0.13409
## Detection Rate : 0.06562
## Detection Prevalence : 0.35663
## Balanced Accuracy : 0.57664
##
## 'Positive' Class : 1
##
Logistic regression for mode 1: fit on the training set, choose the probability cutoff on the validation set, then refit on the combined training and validation data and evaluate on the test set.
# Fit logistic model on training data
# Full logistic regression (all predictors) on the mode-1 training set,
# then search the validation set for the best probability cutoff.
v.logit.model1 <- glm(popular ~ ., family=binomial(link='logit'),data= logit.train1)
#search for best cutoff
out1 <- opt.cut.func(v.logit.model1, logit.valid1)
opt.cut.plot(out1)
# cutoff minimizing |sensitivity - specificity|
out1$cutoff[which.min(out1$ssdiff.vec)]
## [1] 0.11
# kappa-optimal cutoff, kept for the later test-set evaluation
v.opt.cut1 <- out1$cutoff[which.max(out1$kappa.vec)]
v.opt.cut1
## [1] 0.15
Fit final model (combo of train and validation)
v.model.final1 <- glm(popular ~ ., data=all.train1, family=binomial(link='logit'))
predict on test using 0.5 cutoff
# Test-set evaluation at the default 0.5 cutoff.
# Bug fix: the original predicted with `v.logit.model1` (the train-only
# fit) even though `v.model.final1` was refit on train + validation for
# exactly this step; its inline comment also claimed cutoff 0.15 while the
# code used 0.5. The recorded output below reflects the old model.
v.prob.test1 <- predict(v.model.final1, newdata = logit.test1, type = "response")
v.pred.test1 <- ifelse(v.prob.test1 > 0.5, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(v.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 987 113
## 1 1 0
##
## Accuracy : 0.8965
## 95% CI : (0.8769, 0.9138)
## No Information Rate : 0.8974
## P-Value [Acc > NIR] : 0.5643
##
## Kappa : -0.0018
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000000
## Specificity : 0.9989879
## Pos Pred Value : 0.0000000
## Neg Pred Value : 0.8972727
## Prevalence : 0.1026340
## Detection Rate : 0.0000000
## Detection Prevalence : 0.0009083
## Balanced Accuracy : 0.4994939
##
## 'Positive' Class : 1
##
predict on test using optimal cutoff (kappa), 0.15
# Test-set evaluation at the kappa-optimal cutoff (v.opt.cut1 = 0.15).
# Bug fix: predict from `v.model.final1` (refit on train + validation),
# not the train-only `v.logit.model1` the original used. The recorded
# output below reflects the old model.
v.prob.test1 <- predict(v.model.final1, newdata = logit.test1, type = "response")
v.pred.test1 <- ifelse(v.prob.test1 > v.opt.cut1, 1, 0) # using cutoff = 0.15
confusionMatrix(as.factor(v.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 817 72
## 1 171 41
##
## Accuracy : 0.7793
## 95% CI : (0.7536, 0.8035)
## No Information Rate : 0.8974
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1367
##
## Mcnemar's Test P-Value : 3.243e-10
##
## Sensitivity : 0.36283
## Specificity : 0.82692
## Pos Pred Value : 0.19340
## Neg Pred Value : 0.91901
## Prevalence : 0.10263
## Detection Rate : 0.03724
## Detection Prevalence : 0.19255
## Balanced Accuracy : 0.59488
##
## 'Positive' Class : 1
##
predict on test using optimal cutoff (sensitivity and specificity), 0.11
# Test-set evaluation at the 0.11 cutoff (best sensitivity/specificity
# balance on validation).
# Bug fix: predict from `v.model.final1` (refit on train + validation),
# not the train-only `v.logit.model1` the original used. The recorded
# output below reflects the old model.
v.prob.test1 <- predict(v.model.final1, newdata = logit.test1, type = "response")
v.pred.test1 <- ifelse(v.prob.test1 > 0.11, 1, 0) # using cutoff = 0.11
confusionMatrix(as.factor(v.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 609 43
## 1 379 70
##
## Accuracy : 0.6167
## 95% CI : (0.5873, 0.6455)
## No Information Rate : 0.8974
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1018
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.61947
## Specificity : 0.61640
## Pos Pred Value : 0.15590
## Neg Pred Value : 0.93405
## Prevalence : 0.10263
## Detection Rate : 0.06358
## Detection Prevalence : 0.40781
## Balanced Accuracy : 0.61793
##
## 'Positive' Class : 1
##
step-wise 10 fold cross validation
# Stepwise (AIC) logistic regression with 10-fold CV on the mode-1
# training set; capture.output() suppresses glmStepAIC's verbose trace.
set.seed(123)
train_control <- trainControl(method = "cv",
number = 10)
# Fit K-fold CV model
meh <- capture.output(step1_kcv <- train(popular ~ ., data = logit.train1, family = binomial(),
method = "glmStepAIC", trControl = train_control, verbose = FALSE))
# Keep only the final glm object selected by the stepwise search.
step1_kcv <- step1_kcv$finalModel
step1_kcv
##
## Call: NULL
##
## Coefficients:
## (Intercept) duration acousticness energy
## 4.0735 -0.4712 -0.8449 -3.3814
## instrumentalness key3 key4 key10
## -13.0994 0.6327 -0.5020 -0.3727
## loudness speechiness valence
## 0.3220 5.0849 -1.2362
##
## Degrees of Freedom: 5136 Total (i.e. Null); 5126 Residual
## Null Deviance: 3568
## Residual Deviance: 3357 AIC: 3379
#search for best cutoff
# key.dummy() converts the key factor to the dummy columns the stepwise
# model was trained on (helper from thesis_functions.R).
kcv.out1 <- opt.cut.func(step1_kcv, key.dummy(logit.valid1))
opt.cut.plot(kcv.out1)
# cutoff minimizing |sensitivity - specificity|
kcv.out1$cutoff[which.min(kcv.out1$ssdiff.vec)]
## [1] 0.11
# kappa-optimal cutoff, stored for the test-set evaluation
kcv.out1.cut1 <- kcv.out1$cutoff[which.max(kcv.out1$kappa.vec)]
kcv.out1.cut1
## [1] 0.13
Fit final model (combo of train and validation)
# Refit the stepwise model on combined training + validation (all.train1).
set.seed(123)
finalmeh <- capture.output(step1_kcv.final <- train(popular ~ ., data = all.train1, family = binomial(),
method = "glmStepAIC", trControl = train_control, verbose = FALSE))
step1_kcv.final <- step1_kcv.final$finalModel
step1_kcv.final
##
## Call: NULL
##
## Coefficients:
## (Intercept) duration acousticness danceability
## 3.279565 -0.464924 -0.777317 0.976056
## energy instrumentalness key3 key8
## -3.477833 -12.106204 0.512832 0.267298
## key10 loudness speechiness tempo
## -0.459592 0.333033 4.790242 0.002819
## valence
## -1.411113
##
## Degrees of Freedom: 6237 Total (i.e. Null); 6225 Residual
## Null Deviance: 4360
## Residual Deviance: 4095 AIC: 4121
predict on test using 0.5 cutoff
# Stepwise model: test-set evaluation at the default 0.5 cutoff.
kcv.prob.test1 <- predict(step1_kcv.final, newdata = key.dummy(logit.test1), type = "response")
kcv.pred.test1 <- ifelse(kcv.prob.test1 > 0.5, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(kcv.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 988 112
## 1 0 1
##
## Accuracy : 0.8983
## 95% CI : (0.8789, 0.9155)
## No Information Rate : 0.8974
## P-Value [Acc > NIR] : 0.4854
##
## Kappa : 0.0158
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0088496
## Specificity : 1.0000000
## Pos Pred Value : 1.0000000
## Neg Pred Value : 0.8981818
## Prevalence : 0.1026340
## Detection Rate : 0.0009083
## Detection Prevalence : 0.0009083
## Balanced Accuracy : 0.5044248
##
## 'Positive' Class : 1
##
Using cutoff 0.13 which yields optimal kappa
# Stepwise model: test-set evaluation at the kappa-optimal cutoff
# (kcv.out1.cut1 = 0.13 from the validation search).
kcv.prob.test1 <- predict(step1_kcv.final, newdata = key.dummy(logit.test1), type = "response")
kcv.pred.test1 <- ifelse(kcv.prob.test1 > kcv.out1.cut1, 1, 0) # using cutoff = 0.13
confusionMatrix(as.factor(kcv.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 723 56
## 1 265 57
##
## Accuracy : 0.7084
## 95% CI : (0.6806, 0.7352)
## No Information Rate : 0.8974
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1299
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.50442
## Specificity : 0.73178
## Pos Pred Value : 0.17702
## Neg Pred Value : 0.92811
## Prevalence : 0.10263
## Detection Rate : 0.05177
## Detection Prevalence : 0.29246
## Balanced Accuracy : 0.61810
##
## 'Positive' Class : 1
##
Using cut off 0.11 which yields the best balance between sensitivity and specificity.
# Stepwise model: test-set evaluation at the 0.11 cutoff (best
# sensitivity/specificity balance on validation).
kcv.prob.test1 <- predict(step1_kcv.final, newdata = key.dummy(logit.test1), type = "response")
kcv.pred.test1 <- ifelse(kcv.prob.test1 > 0.11, 1, 0) # using cutoff = 0.11
confusionMatrix(as.factor(kcv.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 594 45
## 1 394 68
##
## Accuracy : 0.6013
## 95% CI : (0.5717, 0.6303)
## No Information Rate : 0.8974
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0857
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.60177
## Specificity : 0.60121
## Pos Pred Value : 0.14719
## Neg Pred Value : 0.92958
## Prevalence : 0.10263
## Detection Rate : 0.06176
## Detection Prevalence : 0.41962
## Balanced Accuracy : 0.60149
##
## 'Positive' Class : 1
##
# Exhaustive best-subset search (glmulti) over main effects for the
# mode-1 training set, ranked by AIC.
set.seed(123)
glmulti.out1 <- glmulti(popular ~ ., data = logit.train1,
level = 1, # No interaction considered
method = "h", # Exhaustive approach
crit = "aic", # AIC as criteria
confsetsize = 5, # Keep 5 best models
plotty = F, report = F, # No plot or interim reports
fitfunction = "glm", # glm as the fitting function
family = binomial) # binomial to run logistic regression
#glmulti.out1@formulas
view summary of top model
summary(glmulti.out1@objects[[1]])
##
## Call:
## fitfunc(formula = as.formula(x), family = ..1, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8240 -0.5209 -0.4300 -0.3327 2.7606
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.01931 0.62836 6.397 1.59e-10 ***
## duration -0.46099 0.08964 -5.143 2.71e-07 ***
## acousticness -0.83197 0.27043 -3.077 0.00209 **
## energy -3.38962 0.52995 -6.396 1.59e-10 ***
## instrumentalness -13.17589 9.02825 -1.459 0.14445
## loudness 0.32058 0.03853 8.321 < 2e-16 ***
## speechiness 5.09477 0.63162 8.066 7.25e-16 ***
## valence -1.22718 0.24729 -4.963 6.96e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3568.2 on 5136 degrees of freedom
## Residual deviance: 3372.1 on 5129 degrees of freedom
## AIC: 3388.1
##
## Number of Fisher Scoring iterations: 9
Store model
# Keep the best subset model and search the validation set for its
# optimal probability cutoffs.
allreg.logit1 <- glmulti.out1@objects[[1]]
#search for best cutoff
allreg.out1 <- opt.cut.func(allreg.logit1, logit.valid1)
opt.cut.plot(allreg.out1)
# cutoff minimizing |sensitivity - specificity|
allreg.out1$cutoff[which.min(allreg.out1$ssdiff.vec)]
## [1] 0.11
# kappa-optimal cutoff, stored for the test-set evaluation
allreg.out1.cut1 <- allreg.out1$cutoff[which.max(allreg.out1$kappa.vec)]
allreg.out1.cut1
## [1] 0.16
fit final model to combo of training and validation
# Re-run the exhaustive search on combined training + validation data
# (all.train1) to obtain the final best-subset model.
set.seed(123)
glmulti.out1 <- glmulti(popular ~ ., data = all.train1,
level = 1, # No interaction considered
method = "h", # Exhaustive approach
crit = "aic", # AIC as criteria
confsetsize = 5, # Keep 5 best models
plotty = F, report = F, # No plot or interim reports
fitfunction = "glm", # glm as the fitting function
family = binomial) # binomial to run logistic regression
#glmulti.out1@formulas
summary(glmulti.out1@objects[[1]])
##
## Call:
## fitfunc(formula = as.formula(x), family = ..1, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6717 -0.5243 -0.4296 -0.3299 2.7251
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.289296 0.692110 4.753 2.01e-06 ***
## duration -0.460833 0.081938 -5.624 1.86e-08 ***
## acousticness -0.750350 0.249223 -3.011 0.00261 **
## danceability 0.942373 0.420840 2.239 0.02514 *
## energy -3.483207 0.478705 -7.276 3.43e-13 ***
## instrumentalness -11.929151 7.266196 -1.642 0.10065
## loudness 0.333199 0.035260 9.450 < 2e-16 ***
## speechiness 4.822191 0.576504 8.365 < 2e-16 ***
## tempo 0.002978 0.001572 1.894 0.05822 .
## valence -1.405615 0.249178 -5.641 1.69e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4359.9 on 6237 degrees of freedom
## Residual deviance: 4107.6 on 6228 degrees of freedom
## AIC: 4127.6
##
## Number of Fisher Scoring iterations: 9
store final model
allreg.logit1.final <- glmulti.out1@objects[[1]]
Predictions with 0.5 as the cut off
# Best-subset model: test-set evaluation at the default 0.5 cutoff.
allreg.prob.test1 <- predict(allreg.logit1.final, newdata = logit.test1, type = "response")
allreg.pred.test1 <- ifelse(allreg.prob.test1 > 0.5, 1, 0) # using standard cutoff 0.5
confusionMatrix(as.factor(allreg.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 988 112
## 1 0 1
##
## Accuracy : 0.8983
## 95% CI : (0.8789, 0.9155)
## No Information Rate : 0.8974
## P-Value [Acc > NIR] : 0.4854
##
## Kappa : 0.0158
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0088496
## Specificity : 1.0000000
## Pos Pred Value : 1.0000000
## Neg Pred Value : 0.8981818
## Prevalence : 0.1026340
## Detection Rate : 0.0009083
## Detection Prevalence : 0.0009083
## Balanced Accuracy : 0.5044248
##
## 'Positive' Class : 1
##
Predictions where cut off is the best kappa, 0.16
# Best-subset model: test-set evaluation at the kappa-optimal cutoff
# (allreg.out1.cut1 = 0.16 from the validation search).
allreg.prob.test1 <- predict(allreg.logit1.final, newdata = logit.test1, type = "response")
allreg.pred.test1 <- ifelse(allreg.prob.test1 > allreg.out1.cut1, 1, 0) # using optimal cutoff 0.16
confusionMatrix(as.factor(allreg.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 841 77
## 1 147 36
##
## Accuracy : 0.7965
## 95% CI : (0.7715, 0.82)
## No Information Rate : 0.8974
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1332
##
## Mcnemar's Test P-Value : 4.022e-06
##
## Sensitivity : 0.3186
## Specificity : 0.8512
## Pos Pred Value : 0.1967
## Neg Pred Value : 0.9161
## Prevalence : 0.1026
## Detection Rate : 0.0327
## Detection Prevalence : 0.1662
## Balanced Accuracy : 0.5849
##
## 'Positive' Class : 1
##
Predictions where cut off is the best balance of sensitivity and specificity, 0.11
# Best-subset model: test-set evaluation at the 0.11 cutoff (best
# sensitivity/specificity balance on validation).
allreg.prob.test1 <- predict(allreg.logit1.final, newdata = logit.test1, type = "response")
allreg.pred.test1 <- ifelse(allreg.prob.test1 > 0.11, 1, 0) # using optimal cutoff 0.11
confusionMatrix(as.factor(allreg.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 594 44
## 1 394 69
##
## Accuracy : 0.6022
## 95% CI : (0.5726, 0.6312)
## No Information Rate : 0.8974
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0893
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.61062
## Specificity : 0.60121
## Pos Pred Value : 0.14903
## Neg Pred Value : 0.93103
## Prevalence : 0.10263
## Detection Rate : 0.06267
## Detection Prevalence : 0.42053
## Balanced Accuracy : 0.60592
##
## 'Positive' Class : 1
##
# Ridge-penalized (alpha = 0) logistic regression on the mode-1 training
# set; lambda tuned by CV over a fixed grid.
set.seed(123)
ridge1 <- train(popular ~ ., data = logit.train1, method = "glmnet",
family = "binomial", trControl = train_control,
tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#best tuning parameter
ridge1$bestTune
## alpha lambda
## 100 0 0.1
# NOTE(review): lambda = 0.1 is the grid's upper bound -- consider a
# wider grid. Coefficients at the selected lambda:
ridge1.model <- coef(ridge1$finalModel, ridge1$bestTune$lambda)
ridge1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.940884985
## duration -0.198157468
## acousticness -0.107972712
## danceability -0.008980677
## energy -0.260064046
## instrumentalness -0.827713363
## key1 0.046091952
## key2 0.008592692
## key3 0.349398913
## key4 -0.189817749
## key5 0.003945898
## key6 -0.027278685
## key7 0.091414861
## key8 0.090088954
## key9 0.089685627
## key10 -0.163351622
## key11 -0.024566167
## loudness 0.062462753
## speechiness 2.309752435
## tempo 0.001341193
## valence -0.487758435
Search for best cutoff using validation set
# Validation-set cutoff search for ridge1; values printed, not stored
# (later chunks hard-code 0.12 and 0.11).
ridge1.out <- reg.opt.cut.func(ridge1, logit.valid1)
opt.cut.plot(ridge1.out)
# cut off by kappa
ridge1.out$cutoff[which.max(ridge1.out$kappa.vec)]
## [1] 0.12
# cut off by sensitivity/specificity balance
ridge1.out$cutoff[which.min(ridge1.out$ssdiff.vec)]
## [1] 0.11
create final model
# Refit ridge1 on combined training + validation (all.train1) before the
# final test-set evaluation.
set.seed(123)
ridge1 <- train(popular ~ ., data = all.train1, method = "glmnet",
family = "binomial", trControl = train_control,
tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#best tuning parameter
ridge1$bestTune
## alpha lambda
## 100 0 0.1
# Coefficients at the selected lambda (again at the grid's upper bound).
ridge1.model <- coef(ridge1$finalModel, ridge1$bestTune$lambda)
ridge1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1.031119627
## duration -0.202122239
## acousticness -0.103141402
## danceability 0.159103462
## energy -0.270068249
## instrumentalness -0.817245806
## key1 0.026882836
## key2 0.007008190
## key3 0.266050136
## key4 -0.114394579
## key5 -0.025139220
## key6 -0.032287416
## key7 0.078237033
## key8 0.138366721
## key9 0.130311371
## key10 -0.214434940
## key11 -0.063822095
## loudness 0.064513136
## speechiness 2.177666732
## tempo 0.001583618
## valence -0.474359450
predict and evaluate on the test set where cutoff is at 0.5
# Test-set evaluation at the default 0.5 cutoff.
# NOTE(review): `s` is not a predict.train argument and is presumably
# ignored -- confirm.
prob.ridge1 <- predict(ridge1, s = ridge1$bestTune, logit.test1, type = "prob")
pred.ridge1 <- ifelse(prob.ridge1[,2] > 0.50, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(pred.ridge1), as.factor(logit.test1$popular),
positive = "1")
## Warning in confusionMatrix.default(as.factor(pred.ridge1),
## as.factor(logit.test1$popular), : Levels are not in the same order for reference
## and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 988 113
## 1 0 0
##
## Accuracy : 0.8974
## 95% CI : (0.8779, 0.9147)
## No Information Rate : 0.8974
## P-Value [Acc > NIR] : 0.525
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.8974
## Prevalence : 0.1026
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 1
##
predict and evaluate on the test set where cutoff is at 0.12 corresponding to optimal kappa
# Test-set evaluation at the 0.12 cutoff (kappa-optimal on validation).
prob.ridge1 <- predict(ridge1, s = ridge1$bestTune, logit.test1, type = "prob")
pred.ridge1 <- ifelse(prob.ridge1[,2] > 0.12, 1, 0) # using cutoff = 0.12
confusionMatrix(as.factor(pred.ridge1), as.factor(logit.test1$popular),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 703 60
## 1 285 53
##
## Accuracy : 0.6866
## 95% CI : (0.6583, 0.714)
## No Information Rate : 0.8974
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.096
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.46903
## Specificity : 0.71154
## Pos Pred Value : 0.15680
## Neg Pred Value : 0.92136
## Prevalence : 0.10263
## Detection Rate : 0.04814
## Detection Prevalence : 0.30699
## Balanced Accuracy : 0.59028
##
## 'Positive' Class : 1
##
predict and evaluate on the test set where cutoff is at 0.11 corresponding to optimal balance of sensitivity and specificity
# Test-set evaluation at the 0.11 cutoff (best sensitivity/specificity
# balance on validation). The original inline comment wrongly said 0.13.
prob.ridge1 <- predict(ridge1, s = ridge1$bestTune, logit.test1, type = "prob")
pred.ridge1 <- ifelse(prob.ridge1[,2] > 0.11, 1, 0) # using cutoff = 0.11
confusionMatrix(as.factor(pred.ridge1), as.factor(logit.test1$popular),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 557 36
## 1 431 77
##
## Accuracy : 0.5758
## 95% CI : (0.546, 0.6053)
## No Information Rate : 0.8974
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0962
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.68142
## Specificity : 0.56377
## Pos Pred Value : 0.15157
## Neg Pred Value : 0.93929
## Prevalence : 0.10263
## Detection Rate : 0.06994
## Detection Prevalence : 0.46140
## Balanced Accuracy : 0.62259
##
## 'Positive' Class : 1
##
# LASSO (alpha = 1) logistic regression on the mode-1 training set.
# Fix: added set.seed(123) before the cross-validated train() call --
# every sibling model fit in this file seeds the RNG, and without it the
# CV folds (and hence the selected lambda) are not reproducible.
# NOTE(review): this section stops after the cutoff search -- no final
# refit or test-set evaluation is performed for the LASSO model.
set.seed(123)
lasso1 <- train(popular ~ ., data = logit.train1, method = "glmnet",
family = "binomial", trControl = train_control,
tuneGrid = expand.grid(alpha = 1,lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
lasso1$bestTune
# best coefficient
lasso1.model <- coef(lasso1$finalModel, lasso1$bestTune$lambda)
lasso1.model
Search for best cutoff using validation set
# Validation-set cutoff search for the LASSO model (kappa-optimal and
# sens/spec-balance cutoffs).
lasso1.out <- reg.opt.cut.func(lasso1, logit.valid1)
opt.cut.plot(lasso1.out)
# cut off by kappa
lasso1.out$cutoff[which.max(lasso1.out$kappa.vec)]
# cut off by sensitivity/specificity balance
lasso1.out$cutoff[which.min(lasso1.out$ssdiff.vec)]
# Elastic-net logistic regression on the mode-1 training set, tuning both
# alpha and lambda by cross-validation.
set.seed(123)
enet1 <- train(popular ~ ., data = logit.train1, method = "glmnet",
family = "binomial", trControl = train_control,
tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
enet1$bestTune
## alpha lambda
## 100 0 0.1
# best coefficient
# alpha = 0 selected: the elastic net collapses to ridge (same
# coefficients as ridge1 above).
enet1.model <- coef(enet1$finalModel, enet1$bestTune$lambda)
enet1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.940884985
## duration -0.198157468
## acousticness -0.107972712
## danceability -0.008980677
## energy -0.260064046
## instrumentalness -0.827713363
## key1 0.046091952
## key2 0.008592692
## key3 0.349398913
## key4 -0.189817749
## key5 0.003945898
## key6 -0.027278685
## key7 0.091414861
## key8 0.090088954
## key9 0.089685627
## key10 -0.163351622
## key11 -0.024566167
## loudness 0.062462753
## speechiness 2.309752435
## tempo 0.001341193
## valence -0.487758435
search for best cutoff with validation set
# Validation-set cutoff search for enet1; values printed, not stored
# (later chunks hard-code 0.12 and 0.11).
enet1.out <- reg.opt.cut.func(enet1, logit.valid1)
opt.cut.plot(enet1.out)
# cut off by kappa
enet1.out$cutoff[which.max(enet1.out$kappa.vec)]
## [1] 0.12
# cut off by sensitivity/specificity balance
enet1.out$cutoff[which.min(enet1.out$ssdiff.vec)]
## [1] 0.11
create final model on train and validation
# Refit the elastic net on combined training + validation (all.train1)
# before the final test-set evaluation.
set.seed(123)
enet1 <- train(popular ~ ., data = all.train1, method = "glmnet",
family = "binomial", trControl = train_control,
tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
enet1$bestTune
## alpha lambda
## 100 0 0.1
# best coefficient
# alpha = 0 again: the final model is effectively ridge.
enet1.model <- coef(enet1$finalModel, enet1$bestTune$lambda)
enet1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1.031119627
## duration -0.202122239
## acousticness -0.103141402
## danceability 0.159103462
## energy -0.270068249
## instrumentalness -0.817245806
## key1 0.026882836
## key2 0.007008190
## key3 0.266050136
## key4 -0.114394579
## key5 -0.025139220
## key6 -0.032287416
## key7 0.078237033
## key8 0.138366721
## key9 0.130311371
## key10 -0.214434940
## key11 -0.063822095
## loudness 0.064513136
## speechiness 2.177666732
## tempo 0.001583618
## valence -0.474359450
predict and evaluate on the test set where cutoff is at 0.5
# Test-set evaluation at the default 0.5 cutoff.
# NOTE(review): `s` is not a predict.train argument and is presumably
# ignored -- confirm.
prob.enet1 <- predict(enet1, s = enet1$bestTune, logit.test1, type = "prob")
pred.enet1 <- ifelse(prob.enet1[,2] > 0.50, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(pred.enet1), as.factor(logit.test1$popular),
positive = "1")
## Warning in confusionMatrix.default(as.factor(pred.enet1),
## as.factor(logit.test1$popular), : Levels are not in the same order for reference
## and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 988 113
## 1 0 0
##
## Accuracy : 0.8974
## 95% CI : (0.8779, 0.9147)
## No Information Rate : 0.8974
## P-Value [Acc > NIR] : 0.525
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.8974
## Prevalence : 0.1026
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 1
##
predict and evaluate on the test set where cutoff is at 0.12 corresponding to optimal kappa
# Test-set evaluation at the 0.12 cutoff (kappa-optimal on validation).
prob.enet1 <- predict(enet1, s = enet1$bestTune, logit.test1, type = "prob")
pred.enet1 <- ifelse(prob.enet1[,2] > 0.12, 1, 0) # using cutoff = 0.12
confusionMatrix(as.factor(pred.enet1), as.factor(logit.test1$popular),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 703 60
## 1 285 53
##
## Accuracy : 0.6866
## 95% CI : (0.6583, 0.714)
## No Information Rate : 0.8974
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.096
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.46903
## Specificity : 0.71154
## Pos Pred Value : 0.15680
## Neg Pred Value : 0.92136
## Prevalence : 0.10263
## Detection Rate : 0.04814
## Detection Prevalence : 0.30699
## Balanced Accuracy : 0.59028
##
## 'Positive' Class : 1
##
predict and evaluate on the test set where cutoff is at 0.11 corresponding to optimal balance of sensitivity and specificity
# Test-set evaluation at the 0.11 cutoff (best sensitivity/specificity
# balance on validation). The original inline comment wrongly said 0.13.
prob.enet1 <- predict(enet1, s = enet1$bestTune, logit.test1, type = "prob")
pred.enet1 <- ifelse(prob.enet1[,2] > 0.11, 1, 0) # using cutoff = 0.11
confusionMatrix(as.factor(pred.enet1), as.factor(logit.test1$popular),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 557 36
## 1 431 77
##
## Accuracy : 0.5758
## 95% CI : (0.546, 0.6053)
## No Information Rate : 0.8974
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0962
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.68142
## Specificity : 0.56377
## Pos Pred Value : 0.15157
## Neg Pred Value : 0.93929
## Prevalence : 0.10263
## Detection Rate : 0.06994
## Detection Prevalence : 0.46140
## Balanced Accuracy : 0.62259
##
## 'Positive' Class : 1
##